-
Notifications
You must be signed in to change notification settings - Fork 41
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add `nextafter` intrinsic
#529
base: main
Are you sure you want to change the base?
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
index ecd5e44d..ca30165b 100644
--- a/test/device/intrinsics.jl
+++ b/test/device/intrinsics.jl
@@ -354,45 +354,45 @@ end
end
- let # nextafter
- function nextafter_test(X, y)
- idx = thread_position_in_grid_1d()
- X[idx] = Metal.nextafter(X[idx], y)
- return nothing
- end
+ let # nextafter
+ function nextafter_test(X, y)
+ idx = thread_position_in_grid_1d()
+ X[idx] = Metal.nextafter(X[idx], y)
+ return nothing
+ end
- # Check the code is generated as expected
- outval = T(0)
- function nextafter_out_test()
- Metal.nextafter(outval, outval)
- return
- end
+ # Check the code is generated as expected
+ outval = T(0)
+ function nextafter_out_test()
+ Metal.nextafter(outval, outval)
+ return
+ end
- N = 4
- arr = rand(T, N)
+ N = 4
+ arr = rand(T, N)
- # test the intrinsic (macOS >= v14)
- if metal_support() >= v"3.1"
- buffer1 = MtlArray(arr)
- Metal.@sync @metal threads = N nextafter_test(buffer1, typemax(T))
- @test Array(buffer1) == nextfloat.(arr)
- Metal.@sync @metal threads = N nextafter_test(buffer1, typemin(T))
- @test Array(buffer1) == arr
+ # test the intrinsic (macOS >= v14)
+ if metal_support() >= v"3.1"
+ buffer1 = MtlArray(arr)
+ Metal.@sync @metal threads = N nextafter_test(buffer1, typemax(T))
+ @test Array(buffer1) == nextfloat.(arr)
+ Metal.@sync @metal threads = N nextafter_test(buffer1, typemin(T))
+ @test Array(buffer1) == arr
- ir = sprint(io->(@device_code_llvm io=io dump_module=true @metal nextafter_out_test()))
- @test occursin(Regex("@air\\.nextafter\\.f$(8*sizeof(T))"), ir)
- end
+ ir = sprint(io -> (@device_code_llvm io = io dump_module = true @metal nextafter_out_test()))
+ @test occursin(Regex("@air\\.nextafter\\.f$(8 * sizeof(T))"), ir)
+ end
- # test for metal < 3.1
- buffer2 = MtlArray(arr)
- Metal.@sync @metal threads = N metal = v"3.0" nextafter_test(buffer2, typemax(T))
- @test Array(buffer2) == nextfloat.(arr)
- Metal.@sync @metal threads = N metal = v"3.0" nextafter_test(buffer2, typemin(T))
- @test Array(buffer2) == arr
+ # test for metal < 3.1
+ buffer2 = MtlArray(arr)
+ Metal.@sync @metal threads = N metal = v"3.0" nextafter_test(buffer2, typemax(T))
+ @test Array(buffer2) == nextfloat.(arr)
+ Metal.@sync @metal threads = N metal = v"3.0" nextafter_test(buffer2, typemin(T))
+ @test Array(buffer2) == arr
- ir = sprint(io->(@device_code_llvm io=io dump_module=true @metal metal = v"3.0" nextafter_out_test()))
- @test occursin(Regex("@air\\.sign\\.f$(8*sizeof(T))"), ir)
- end
+ ir = sprint(io -> (@device_code_llvm io = io dump_module = true @metal metal = v"3.0" nextafter_out_test()))
+ @test occursin(Regex("@air\\.sign\\.f$(8 * sizeof(T))"), ir)
+ end
end
end
|
2569fe1
to
18279c9
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
Benchmark suite | Current: 78ba579 | Previous: 6654291 | Ratio |
---|---|---|---|
private array/construct |
24520.833333333332 ns |
26250 ns |
0.93 |
private array/broadcast |
461167 ns |
462084 ns |
1.00 |
private array/random/randn/Float32 |
929084 ns |
826208 ns |
1.12 |
private array/random/randn!/Float32 |
600000 ns |
627791.5 ns |
0.96 |
private array/random/rand!/Int64 |
550792 ns |
558250 ns |
0.99 |
private array/random/rand!/Float32 |
552917 ns |
587500 ns |
0.94 |
private array/random/rand/Int64 |
927583 ns |
777229 ns |
1.19 |
private array/random/rand/Float32 |
812042 ns |
634042 ns |
1.28 |
private array/copyto!/gpu_to_gpu |
541979 ns |
658833 ns |
0.82 |
private array/copyto!/cpu_to_gpu |
726729 ns |
709208 ns |
1.02 |
private array/copyto!/gpu_to_cpu |
627542 ns |
665625 ns |
0.94 |
private array/accumulate/1d |
1411750 ns |
1338542 ns |
1.05 |
private array/accumulate/2d |
1490354.5 ns |
1395208 ns |
1.07 |
private array/iteration/findall/int |
2261042 ns |
2084458 ns |
1.08 |
private array/iteration/findall/bool |
2033000 ns |
1817562.5 ns |
1.12 |
private array/iteration/findfirst/int |
1796354.5 ns |
1711563 ns |
1.05 |
private array/iteration/findfirst/bool |
1720270.5 ns |
1674417 ns |
1.03 |
private array/iteration/scalar |
2553979.5 ns |
3782854 ns |
0.68 |
private array/iteration/logical |
3465479.5 ns |
3254708.5 ns |
1.06 |
private array/iteration/findmin/1d |
1863167 ns |
1778417 ns |
1.05 |
private array/iteration/findmin/2d |
1411791 ns |
1351834 ns |
1.04 |
private array/reductions/reduce/1d |
943937.5 ns |
1034625 ns |
0.91 |
private array/reductions/reduce/2d |
704208.5 ns |
663541 ns |
1.06 |
private array/reductions/mapreduce/1d |
960000 ns |
1036542 ns |
0.93 |
private array/reductions/mapreduce/2d |
697500 ns |
667125 ns |
1.05 |
private array/permutedims/4d |
2650771 ns |
2546416 ns |
1.04 |
private array/permutedims/2d |
1092583 ns |
1022500 ns |
1.07 |
private array/permutedims/3d |
1820937 ns |
1618208 ns |
1.13 |
private array/copy |
831229 ns |
582708 ns |
1.43 |
latency/precompile |
9048827084 ns |
8868498250 ns |
1.02 |
latency/ttfp |
3641187000 ns |
3613257333 ns |
1.01 |
latency/import |
1250856167 ns |
1235355708 ns |
1.01 |
integration/metaldevrt |
749208.5 ns |
721583 ns |
1.04 |
integration/byval/slices=1 |
1677437.5 ns |
1558729.5 ns |
1.08 |
integration/byval/slices=3 |
20036458 ns |
9618396 ns |
2.08 |
integration/byval/reference |
1663417 ns |
1546854.5 ns |
1.08 |
integration/byval/slices=2 |
2795708 ns |
2557458 ns |
1.09 |
kernel/indexing |
459583 ns |
476250 ns |
0.97 |
kernel/indexing_checked |
471500 ns |
466417 ns |
1.01 |
kernel/launch |
8000 ns |
8125 ns |
0.98 |
metal/synchronization/stream |
15042 ns |
14667 ns |
1.03 |
metal/synchronization/context |
15375 ns |
14834 ns |
1.04 |
shared array/construct |
23565.916666666664 ns |
24854.166666666668 ns |
0.95 |
shared array/broadcast |
461833 ns |
455833 ns |
1.01 |
shared array/random/randn/Float32 |
919583 ns |
820500 ns |
1.12 |
shared array/random/randn!/Float32 |
589437.5 ns |
639041 ns |
0.92 |
shared array/random/rand!/Int64 |
554000 ns |
549729.5 ns |
1.01 |
shared array/random/rand!/Float32 |
562584 ns |
601250 ns |
0.94 |
shared array/random/rand/Int64 |
841500 ns |
751833.5 ns |
1.12 |
shared array/random/rand/Float32 |
799458 ns |
608854 ns |
1.31 |
shared array/copyto!/gpu_to_gpu |
80167 ns |
84542 ns |
0.95 |
shared array/copyto!/cpu_to_gpu |
81833 ns |
81417 ns |
1.01 |
shared array/copyto!/gpu_to_cpu |
81958 ns |
85333.5 ns |
0.96 |
shared array/accumulate/1d |
1401208 ns |
1347916.5 ns |
1.04 |
shared array/accumulate/2d |
1490959 ns |
1396125 ns |
1.07 |
shared array/iteration/findall/int |
2010292 ns |
1836562.5 ns |
1.09 |
shared array/iteration/findall/bool |
1733833 ns |
1595583 ns |
1.09 |
shared array/iteration/findfirst/int |
1516791.5 ns |
1408417 ns |
1.08 |
shared array/iteration/findfirst/bool |
1433979 ns |
1371458 ns |
1.05 |
shared array/iteration/scalar |
161395.5 ns |
154834 ns |
1.04 |
shared array/iteration/logical |
3253750 ns |
2978125 ns |
1.09 |
shared array/iteration/findmin/1d |
1570458 ns |
1471042 ns |
1.07 |
shared array/iteration/findmin/2d |
1434229.5 ns |
1369917 ns |
1.05 |
shared array/reductions/reduce/1d |
668292 ns |
738208 ns |
0.91 |
shared array/reductions/reduce/2d |
697834 ns |
670417 ns |
1.04 |
shared array/reductions/mapreduce/1d |
699521 ns |
732854 ns |
0.95 |
shared array/reductions/mapreduce/2d |
696104.5 ns |
666916.5 ns |
1.04 |
shared array/permutedims/4d |
2629917 ns |
2545729.5 ns |
1.03 |
shared array/permutedims/2d |
1106083 ns |
1007791 ns |
1.10 |
shared array/permutedims/3d |
1817417 ns |
1583166 ns |
1.15 |
shared array/copy |
210833 ns |
244917 ns |
0.86 |
This comment was automatically generated by workflow using github-action-benchmark.
54e1adc
to
43147a3
Compare
Error doesn't seem related:
|
43147a3
to
987b73b
Compare
9eff7d2
to
f4b4a58
Compare
bb6e7fa
to
5354003
Compare
This is still ready for review. Tests pass locally on macOS 13 |
Tests should still pass when run on macOS 13
e581e2b
to
78ba579
Compare
#373